library(tidyverse)
library(knitr)
library(janitor)
library("readxl")
library(ggfortify)
library(GGally)
library(qtlcharts)
library(leaps)
# Both sides of the former merge conflict load sjPlot; pheatmap comes from the
# incoming branch.
library(sjPlot)
library(pheatmap)

1. Introduction

The data set is adapted from th

1.1 Sampling method and potential biases

1.2 Data import, processing and cleaning

# Read the tab-separated body-fat file; the literal string "NA" marks missing values.
data <- read_tsv("bodyfat.txt", na = c("NA"))
# Normalise the column names (e.g. pct_bf) so they are valid, consistent R names.
data <- data %>% janitor::clean_names()
glimpse(data)
## Observations: 250
## Variables: 16
## $ density <dbl> 1.0708, 1.0853, 1.0414, 1.0751, 1.0340, 1.0502, 1.0549,…
## $ pct_bf  <dbl> 12.3, 6.1, 25.3, 10.4, 28.7, 20.9, 19.2, 12.4, 4.1, 11.…
## $ age     <dbl> 23, 22, 22, 26, 24, 24, 26, 25, 25, 23, 26, 27, 32, 30,…
## $ weight  <dbl> 154.25, 173.25, 154.00, 184.75, 184.25, 210.25, 181.00,…
## $ height  <dbl> 67.75, 72.25, 66.25, 72.25, 71.25, 74.75, 69.75, 72.50,…
## $ neck    <dbl> 36.2, 38.5, 34.0, 37.4, 34.4, 39.0, 36.4, 37.8, 38.1, 4…
## $ chest   <dbl> 93.1, 93.6, 95.8, 101.8, 97.3, 104.5, 105.1, 99.6, 100.…
## $ abdomen <dbl> 85.2, 83.0, 87.9, 86.4, 100.0, 94.4, 90.7, 88.5, 82.5, …
## $ waist   <dbl> 33.54331, 32.67717, 34.60630, 34.01575, 39.37008, 37.16…
## $ hip     <dbl> 94.5, 98.7, 99.2, 101.2, 101.9, 107.8, 100.3, 97.1, 99.…
## $ thigh   <dbl> 59.0, 58.7, 59.6, 60.1, 63.2, 66.0, 58.4, 60.0, 62.9, 6…
## $ knee    <dbl> 37.3, 37.3, 38.9, 37.3, 42.2, 42.0, 38.3, 39.4, 38.3, 4…
## $ ankle   <dbl> 21.9, 23.4, 24.0, 22.8, 24.0, 25.6, 22.9, 23.2, 23.8, 2…
## $ bicep   <dbl> 32.0, 30.5, 28.8, 32.4, 32.2, 35.7, 31.9, 30.5, 35.9, 3…
## $ forearm <dbl> 27.4, 28.9, 25.2, 29.4, 27.7, 30.6, 27.8, 29.0, 31.1, 3…
## $ wrist   <dbl> 17.1, 18.2, 16.6, 18.2, 17.7, 18.8, 17.7, 18.8, 18.2, 1…

2. Analysis

2.1 Body Fat

TODO: provide context for the question — explain why linear regression on the full model is used.

2.1.1 Defining the model with population parameters

\[ Percentage of Body Fat = \beta_0 + \beta_1density + \beta_2age + \beta_3weight + \beta_4height\\ + \beta_5neck + \beta_6chest + \beta_7abdomen + \beta_8waist + \beta_9hip + \beta_{10}thigh\\ + \beta_{11}knee + \beta_{12}ankle + \beta_{13}bicep + \beta_{14}forearm + \beta_{15}wrist + \epsilon \]

2.1.2 Linear regression assumptions for the full model

The residuals \(\epsilon_i\) are iid \(N(0,\sigma^2)\) and there is a linear relationship between y and x.

# Full model: regress body-fat percentage on every other column of `data`.
pbf_lm <- lm(pct_bf ~ ., data = data)
summary(pbf_lm)
## 
## Call:
## lm(formula = pct_bf ~ ., data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.3746 -0.3725 -0.1157  0.2358 15.0629 
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.494e+02  1.154e+01  38.961   <2e-16 ***
## density     -4.098e+02  8.384e+00 -48.876   <2e-16 ***
## age          1.395e-02  9.721e-03   1.435    0.153    
## weight       1.527e-02  2.015e-02   0.758    0.449    
## height      -1.558e-02  5.752e-02  -0.271    0.787    
## neck        -1.653e-02  7.084e-02  -0.233    0.816    
## chest        1.790e-02  3.259e-02   0.549    0.583    
## abdomen      1.833e-02  3.286e-02   0.558    0.578    
## waist               NA         NA      NA       NA    
## hip          2.537e-02  4.391e-02   0.578    0.564    
## thigh       -2.107e-02  4.421e-02  -0.476    0.634    
## knee        -1.657e-02  7.366e-02  -0.225    0.822    
## ankle       -8.160e-02  6.616e-02  -1.233    0.219    
## bicep       -5.256e-02  5.132e-02  -1.024    0.307    
## forearm      1.405e-02  6.229e-02   0.225    0.822    
## wrist       -1.883e-02  1.640e-01  -0.115    0.909    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.276 on 235 degrees of freedom
## Multiple R-squared:  0.9777, Adjusted R-squared:  0.9763 
## F-statistic: 734.4 on 14 and 235 DF,  p-value: < 2.2e-16
# Residuals-vs-fitted and normal Q-Q diagnostics for the full model.
autoplot(pbf_lm, which = 1:2) + theme_bw()
  1. Linearity: In the scatterplot above, .
  2. Homoskedasticity: In the scatterplot above, .
  3. Normality: In the QQ plot above, .
2.1.3 Dropping variables using the AIC starting from the full model
# Backward elimination by AIC, starting from the full body-fat model.
pbf_step <- step(pbf_lm, direction = "backward")
## Start:  AIC=136.49
## pct_bf ~ density + age + weight + height + neck + chest + abdomen + 
##     waist + hip + thigh + knee + ankle + bicep + forearm + wrist
## 
## 
## Step:  AIC=136.49
## pct_bf ~ density + age + weight + height + neck + chest + abdomen + 
##     hip + thigh + knee + ankle + bicep + forearm + wrist
## 
##           Df Sum of Sq    RSS    AIC
## - wrist    1       0.0  382.8 134.51
## - knee     1       0.1  382.9 134.55
## - forearm  1       0.1  382.9 134.55
## - neck     1       0.1  382.9 134.55
## - height   1       0.1  382.9 134.57
## - thigh    1       0.4  383.1 134.73
## - chest    1       0.5  383.3 134.81
## - abdomen  1       0.5  383.3 134.82
## - hip      1       0.5  383.3 134.85
## - weight   1       0.9  383.7 135.10
## - bicep    1       1.7  384.5 135.61
## - ankle    1       2.5  385.2 136.11
## <none>                  382.8 136.49
## - age      1       3.4  386.1 136.67
## - density  1    3891.0 4273.7 737.70
## 
## Step:  AIC=134.51
## pct_bf ~ density + age + weight + height + neck + chest + abdomen + 
##     hip + thigh + knee + ankle + bicep + forearm
## 
##           Df Sum of Sq    RSS    AIC
## - forearm  1       0.1  382.9 132.55
## - knee     1       0.1  382.9 132.56
## - height   1       0.1  382.9 132.58
## - neck     1       0.1  382.9 132.59
## - thigh    1       0.4  383.2 132.74
## - abdomen  1       0.5  383.3 132.84
## - chest    1       0.5  383.3 132.84
## - hip      1       0.5  383.3 132.86
## - weight   1       0.9  383.7 133.10
## - bicep    1       1.7  384.5 133.64
## - ankle    1       2.8  385.6 134.31
## <none>                  382.8 134.51
## - age      1       3.6  386.4 134.88
## - density  1    4097.8 4480.6 747.51
## 
## Step:  AIC=132.55
## pct_bf ~ density + age + weight + height + neck + chest + abdomen + 
##     hip + thigh + knee + ankle + bicep
## 
##           Df Sum of Sq    RSS    AIC
## - knee     1       0.1  382.9 130.61
## - neck     1       0.1  383.0 130.61
## - height   1       0.1  383.0 130.64
## - thigh    1       0.4  383.2 130.80
## - abdomen  1       0.5  383.3 130.86
## - chest    1       0.5  383.4 130.89
## - hip      1       0.5  383.4 130.90
## - weight   1       1.0  383.9 131.22
## - bicep    1       1.7  384.5 131.64
## - ankle    1       2.8  385.6 132.34
## <none>                  382.9 132.55
## - age      1       3.6  386.4 132.88
## - density  1    4112.5 4495.4 746.34
## 
## Step:  AIC=130.61
## pct_bf ~ density + age + weight + height + neck + chest + abdomen + 
##     hip + thigh + ankle + bicep
## 
##           Df Sum of Sq    RSS    AIC
## - neck     1       0.1  383.0 128.66
## - height   1       0.2  383.1 128.72
## - abdomen  1       0.5  383.4 128.92
## - hip      1       0.5  383.4 128.93
## - thigh    1       0.5  383.5 128.95
## - chest    1       0.5  383.5 128.95
## - weight   1       1.0  383.9 129.23
## - bicep    1       1.7  384.6 129.68
## <none>                  382.9 130.61
## - ankle    1       3.1  386.0 130.61
## - age      1       3.6  386.5 130.94
## - density  1    4114.8 4497.7 744.47
## 
## Step:  AIC=128.66
## pct_bf ~ density + age + weight + height + chest + abdomen + 
##     hip + thigh + ankle + bicep
## 
##           Df Sum of Sq    RSS    AIC
## - height   1       0.1  383.2 126.75
## - abdomen  1       0.5  383.5 126.96
## - chest    1       0.5  383.6 127.01
## - thigh    1       0.5  383.6 127.01
## - hip      1       0.6  383.6 127.07
## - weight   1       0.9  383.9 127.23
## - bicep    1       1.8  384.8 127.84
## - ankle    1       3.0  386.1 128.63
## <none>                  383.0 128.66
## - age      1       3.5  386.5 128.94
## - density  1    4238.1 4621.1 749.23
## 
## Step:  AIC=126.75
## pct_bf ~ density + age + weight + chest + abdomen + hip + thigh + 
##     ankle + bicep
## 
##           Df Sum of Sq    RSS    AIC
## - thigh    1       0.4  383.6 125.04
## - abdomen  1       0.6  383.8 125.17
## - hip      1       0.7  383.9 125.22
## - weight   1       0.9  384.1 125.35
## - chest    1       1.0  384.2 125.43
## - bicep    1       1.7  384.8 125.85
## - ankle    1       3.0  386.1 126.68
## <none>                  383.2 126.75
## - age      1       3.5  386.7 127.04
## - density  1    4258.1 4641.2 748.32
## 
## Step:  AIC=125.04
## pct_bf ~ density + age + weight + chest + abdomen + hip + ankle + 
##     bicep
## 
##           Df Sum of Sq    RSS    AIC
## - hip      1       0.4  384.0 123.30
## - abdomen  1       0.6  384.2 123.42
## - weight   1       0.8  384.4 123.55
## - chest    1       1.3  384.9 123.88
## - bicep    1       2.4  386.0 124.62
## - ankle    1       3.0  386.6 125.00
## <none>                  383.6 125.04
## - age      1       5.0  388.6 126.29
## - density  1    4304.4 4688.1 748.83
## 
## Step:  AIC=123.3
## pct_bf ~ density + age + weight + chest + abdomen + ankle + bicep
## 
##           Df Sum of Sq    RSS    AIC
## - chest    1       1.1  385.1 121.99
## - abdomen  1       1.1  385.1 122.00
## - weight   1       2.2  386.2 122.71
## - bicep    1       2.4  386.4 122.88
## - ankle    1       3.0  387.0 123.27
## <none>                  384.0 123.30
## - age      1       4.6  388.6 124.29
## - density  1    4304.1 4688.1 746.83
## 
## Step:  AIC=121.99
## pct_bf ~ density + age + weight + abdomen + ankle + bicep
## 
##           Df Sum of Sq    RSS    AIC
## - bicep    1       2.1  387.2 121.36
## - abdomen  1       2.5  387.5 121.59
## <none>                  385.1 121.99
## - ankle    1       3.2  388.3 122.07
## - weight   1       3.8  388.8 122.42
## - age      1       5.6  390.6 123.58
## - density  1    4313.7 4698.8 745.40
## 
## Step:  AIC=121.36
## pct_bf ~ density + age + weight + abdomen + ankle
## 
##           Df Sum of Sq    RSS    AIC
## - weight   1       2.0  389.1 120.62
## - ankle    1       3.0  390.2 121.29
## <none>                  387.2 121.36
## - abdomen  1       3.3  390.5 121.49
## - age      1       5.5  392.7 122.90
## - density  1    4360.2 4747.3 745.97
## 
## Step:  AIC=120.62
## pct_bf ~ density + age + abdomen + ankle
## 
##           Df Sum of Sq    RSS    AIC
## - ankle    1       1.5  390.7 119.60
## <none>                  389.1 120.62
## - age      1       3.8  392.9 121.03
## - abdomen  1      30.4  419.5 137.41
## - density  1    4847.5 5236.6 768.49
## 
## Step:  AIC=119.6
## pct_bf ~ density + age + abdomen
## 
##           Df Sum of Sq    RSS    AIC
## <none>                  390.7 119.60
## - age      1       5.1  395.8 120.85
## - abdomen  1      29.7  420.4 135.93
## - density  1    4952.5 5343.1 771.53

Backwards selection using the AIC dropped all variables except for age, density and abdomen which are kept in the model.

2.1.4 Fitted model for the model selected by the step-wise procedure.

\[ Percentage of Body Fat = 442.3755 - 406.493 \times density\\ + 0.0118 \times age + 0.0576 \times abdomen \]

Looking at the \(R^2\) value (multiple R-squared) from the summary output, approximately 98% of the variability in percentage of body fat is explained by the regression on density, age and abdomen circumference.

  1. On average, holding the other variables constant, a 1 \(gm/cm^3\) increase in density leads to a 406.5 unit decrease in percentage of body fat (equivalently, a 0.01 \(gm/cm^3\) increase leads to a decrease of about 4.07 units).
  2. On average, holding the other variables constant, a one year increase in age leads to a 0.0118 unit increase in percentage of body fat.
  3. On average, holding the other variables constant, a one centimetre increase in abdomen circumference leads to a 0.0576 unit increase in percentage of body fat.
#options("scipen"=100, "digits"=4)
# Coefficient table and fit statistics for the model kept by the backward AIC search.
summary(pbf_step)
## 
## Call:
## lm(formula = pct_bf ~ density + age + abdomen, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.2913 -0.3576 -0.0911  0.2319 15.4601 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.424e+02  8.738e+00  50.626  < 2e-16 ***
## density     -4.065e+02  7.279e+00 -55.844  < 2e-16 ***
## age          1.182e-02  6.579e-03   1.796   0.0737 .  
## abdomen      5.761e-02  1.332e-02   4.326 2.21e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.26 on 246 degrees of freedom
## Multiple R-squared:  0.9772, Adjusted R-squared:  0.9769 
## F-statistic:  3513 on 3 and 246 DF,  p-value: < 2.2e-16
#options("scipen"=-100, "digits"=4)
2.1.5 Linear regression assumptions for the stepwise model - why do this? same as previous??
# Diagnostic plots 1 and 2 (residuals vs fitted, normal Q-Q) for the stepwise model.
autoplot(pbf_step, which = 1:2) + theme_bw()

2. Analysis

2.1 Prediction for Obesity

Due to the increasing consumption of fast food and the increasing convenience of food deliveries, concern about obesity levels is rising throughout the world and has reached a new high. This increasing concern has led to an increasing need to measure obesity accurately, and percentage body fat is arguably the most accurate measure by far. However, the calculation of body fat is difficult and many have switched to Body Mass Index (BMI) for simpler calculation. This section compares the results from predicting body fat percentage using other body measurements with those from predicting BMI using other body measurements, to determine which body measurement is the most important in determining obesity.

Data import, Processing and Cleaning

# Load the tab-delimited body-fat file and normalise its column names in one pipeline.
data <- "bodyfat.txt" %>%
  read.delim() %>%
  janitor::clean_names()
glimpse(data)
## Observations: 250
## Variables: 16
## $ density <dbl> 1.0708, 1.0853, 1.0414, 1.0751, 1.0340, 1.0502, 1.0549,…
## $ pct_bf  <dbl> 12.3, 6.1, 25.3, 10.4, 28.7, 20.9, 19.2, 12.4, 4.1, 11.…
## $ age     <int> 23, 22, 22, 26, 24, 24, 26, 25, 25, 23, 26, 27, 32, 30,…
## $ weight  <dbl> 154.25, 173.25, 154.00, 184.75, 184.25, 210.25, 181.00,…
## $ height  <dbl> 67.75, 72.25, 66.25, 72.25, 71.25, 74.75, 69.75, 72.50,…
## $ neck    <dbl> 36.2, 38.5, 34.0, 37.4, 34.4, 39.0, 36.4, 37.8, 38.1, 4…
## $ chest   <dbl> 93.1, 93.6, 95.8, 101.8, 97.3, 104.5, 105.1, 99.6, 100.…
## $ abdomen <dbl> 85.2, 83.0, 87.9, 86.4, 100.0, 94.4, 90.7, 88.5, 82.5, …
## $ waist   <dbl> 33.54331, 32.67717, 34.60630, 34.01575, 39.37008, 37.16…
## $ hip     <dbl> 94.5, 98.7, 99.2, 101.2, 101.9, 107.8, 100.3, 97.1, 99.…
## $ thigh   <dbl> 59.0, 58.7, 59.6, 60.1, 63.2, 66.0, 58.4, 60.0, 62.9, 6…
## $ knee    <dbl> 37.3, 37.3, 38.9, 37.3, 42.2, 42.0, 38.3, 39.4, 38.3, 4…
## $ ankle   <dbl> 21.9, 23.4, 24.0, 22.8, 24.0, 25.6, 22.9, 23.2, 23.8, 2…
## $ bicep   <dbl> 32.0, 30.5, 28.8, 32.4, 32.2, 35.7, 31.9, 30.5, 35.9, 3…
## $ forearm <dbl> 27.4, 28.9, 25.2, 29.4, 27.7, 30.6, 27.8, 29.0, 31.1, 3…
## $ wrist   <dbl> 17.1, 18.2, 16.6, 18.2, 17.7, 18.8, 17.7, 18.8, 18.2, 1…
# Introduce the BMI variable (imperial formula: weight in lbs * 703 / height in inches^2).
# Columns are referenced directly inside mutate() rather than through `data$`.
data <- data %>%
  mutate(bmi = weight / height^2 * 703)
# Body-fat data set: keep pct_bf and the body measurements only. The columns are
# dropped by name instead of by position, so the subset stays correct if the
# column order ever changes.
data_bf <- data %>%
  dplyr::select(-c(density, age, weight, height, bmi))
# BMI data set: keep bmi and the body measurements only. Weight and height are
# removed as well because they are the inputs of the BMI formula.
data_bmi <- data %>%
  dplyr::select(-c(density, pct_bf, age, weight, height))

Body Fat Percentage

Data Visualisation
# Interactive correlation matrix of the columns in data_bf.
qtlcharts::iplotCorr(data_bf)
## Set screen size to height=700 x width=1000

Based on the interactive correlation matrix, it can be seen the level of correlation differs quite drastically between the variables and the backward variable selection method is adopted.

Multiple Regression
# Full body-fat model: pct_bf regressed on every other column of data_bf.
bf_lm <- lm(pct_bf ~ ., data = data_bf)
summary(bf_lm)
## 
## Call:
## lm(formula = pct_bf ~ ., data = data_bf)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.8684 -2.9088 -0.1904  3.0491 11.1421 
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.20340    6.83392   0.322  0.74742    
## neck        -0.45612    0.23034  -1.980  0.04882 *  
## chest       -0.13005    0.09197  -1.414  0.15866    
## abdomen      1.03299    0.07638  13.524  < 2e-16 ***
## waist             NA         NA      NA       NA    
## hip         -0.33000    0.12768  -2.585  0.01034 *  
## thigh        0.08793    0.13395   0.656  0.51217    
## knee        -0.13537    0.22744  -0.595  0.55227    
## ankle        0.05505    0.21751   0.253  0.80041    
## bicep        0.17762    0.17029   1.043  0.29798    
## forearm      0.19468    0.20718   0.940  0.34834    
## wrist       -1.52499    0.50529  -3.018  0.00282 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.341 on 239 degrees of freedom
## Multiple R-squared:  0.737,  Adjusted R-squared:  0.726 
## F-statistic: 66.98 on 10 and 239 DF,  p-value: < 2.2e-16

Using the individual p-value method, the variables that need to be dropped are chest, waist, thigh, knee, ankle, bicep and forearm, with ankle being the first to be dropped due to its high p-value. However, to double check, the AIC criterion will also be considered.

# Backward AIC selection from the full body-fat model, with the per-step log suppressed.
bf_step_back <- step(
  bf_lm,
  direction = "backward",
  trace = FALSE
)
summary(bf_step_back)
## 
## Call:
## lm(formula = pct_bf ~ neck + chest + abdomen + hip + bicep + 
##     wrist, data = data_bf)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.668 -2.889 -0.361  3.210 11.148 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.52703    6.63727   0.230 0.818232    
## neck        -0.39650    0.22234  -1.783 0.075783 .  
## chest       -0.12810    0.08992  -1.425 0.155562    
## abdomen      1.01805    0.07431  13.700  < 2e-16 ***
## hip         -0.28758    0.09232  -3.115 0.002060 ** 
## bicep        0.26094    0.15160   1.721 0.086469 .  
## wrist       -1.55084    0.45510  -3.408 0.000767 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.32 on 243 degrees of freedom
## Multiple R-squared:  0.7353, Adjusted R-squared:  0.7287 
## F-statistic: 112.5 on 6 and 243 DF,  p-value: < 2.2e-16

Based on the backward selection model, the fitted model has become:

$\widehat{\text{Percentage of Body Fat}} = 1.53 - 0.3965\,\text{neck} - 0.128\,\text{chest} + 1.01805\,\text{abdomen} - 0.28758\,\text{hip} + 0.26\,\text{bicep} - 1.55084\,\text{wrist}$

Finally, to check assumption, we perform the ggfortify function.

# Residuals-vs-fitted and normal Q-Q plots for the selected body-fat model.
# ggfortify's autoplot() returns a ggplot-based object, so theme_bw() is actually
# applied; base plot() returns NULL, which is why the earlier call printed NULL.
# This also avoids leaving par(mfrow) modified for later plots.
autoplot(bf_step_back, which = 1:2) + theme_bw()

The QQ plot shows a straight line which indicates that the normality assumption is reasonable. However, the residuals vs fitted plot shows a slight variation; but given that body fat is hard to predict, this is acceptable.

Final fitted model

$\widehat{\text{Percentage of Body Fat}} = 1.53 - 0.3965\,\text{neck} - 0.128\,\text{chest} + 1.01805\,\text{abdomen} - 0.28758\,\text{hip} + 0.26\,\text{bicep} - 1.55084\,\text{wrist}$

BMI

For this analysis, the formula of BMI is \(BMI = \frac{Weight (lbs)*703}{Height(in)^2}\)

Data Visualisation
# Interactive correlation matrix of the columns in data_bmi.
qtlcharts::iplotCorr(data_bmi)

Based on the interactive correlation matrix, it can be seen the level of correlation differs quite drastically between the variables and the backward variable selection method is adopted.

Multiple Regression
# Full BMI model: bmi regressed on every other column of data_bmi.
bmi_lm <- lm(bmi ~ ., data = data_bmi)
summary(bmi_lm)
## 
## Call:
## lm(formula = bmi ~ ., data = data_bmi)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1538 -0.6529  0.0036  0.6464  3.7589 
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -11.337205   1.667286  -6.800 8.32e-11 ***
## neck          0.031220   0.056196   0.556    0.579    
## chest         0.148829   0.022439   6.633 2.18e-10 ***
## abdomen       0.130813   0.018636   7.020 2.29e-11 ***
## waist               NA         NA      NA       NA    
## hip           0.048917   0.031149   1.570    0.118    
## thigh         0.135537   0.032679   4.147 4.67e-05 ***
## knee         -0.253557   0.055488  -4.570 7.84e-06 ***
## ankle         0.056067   0.053066   1.057    0.292    
## bicep         0.051276   0.041545   1.234    0.218    
## forearm       0.076917   0.050545   1.522    0.129    
## wrist         0.005644   0.123276   0.046    0.964    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.059 on 239 degrees of freedom
## Multiple R-squared:  0.9039, Adjusted R-squared:  0.8998 
## F-statistic: 224.7 on 10 and 239 DF,  p-value: < 2.2e-16

Using the individual p-value method, the variables that need to be dropped are neck, hip, ankle, bicep, forearm and wrist. To double check, the AIC criterion will also be considered.

# Backward AIC selection from the full BMI model, with the per-step log suppressed.
bmi_step_back <- step(
  bmi_lm,
  direction = "backward",
  trace = FALSE
)
summary(bmi_step_back)
## 
## Call:
## lm(formula = bmi ~ chest + abdomen + hip + thigh + knee + forearm, 
##     data = data_bmi)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1197 -0.6944 -0.0274  0.6831  3.8464 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -10.94257    1.43829  -7.608 6.10e-13 ***
## chest         0.16090    0.02122   7.582 7.18e-13 ***
## abdomen       0.12726    0.01826   6.968 3.01e-11 ***
## hip           0.05047    0.03084   1.637   0.1030    
## thigh         0.14983    0.03032   4.942 1.44e-06 ***
## knee         -0.23116    0.05148  -4.490 1.10e-05 ***
## forearm       0.11484    0.04468   2.571   0.0108 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.058 on 243 degrees of freedom
## Multiple R-squared:  0.9024, Adjusted R-squared:    0.9 
## F-statistic: 374.6 on 6 and 243 DF,  p-value: < 2.2e-16

Based on the backward selection model, the fitted model has become:

$\widehat{\text{BMI}} = -10.94 + 0.161\,\text{chest} + 0.127\,\text{abdomen} + 0.050\,\text{hip} + 0.150\,\text{thigh} - 0.231\,\text{knee} + 0.115\,\text{forearm}$

Finally, to check assumption, we perform the ggfortify function.

# Residuals-vs-fitted and normal Q-Q plots for the selected BMI model.
# ggfortify's autoplot() returns a ggplot-based object, so theme_bw() is actually
# applied; base plot() returns NULL, which is why the earlier call printed NULL.
# This also avoids leaving par(mfrow) modified for later plots.
autoplot(bmi_step_back, which = 1:2) + theme_bw()

The QQ plot shows a straight line which indicates that the normality assumption is reasonable. However, the residuals vs fitted plot shows a fan shaped plot which indicates that the assumption of homogeneous variance is violated. We can use a log transformed response and re-fit the linear regression.

# Refit the full BMI model with a log-transformed response.
ln_bmi_lm <- lm(log(bmi) ~ ., data = data_bmi)
summary(ln_bmi_lm)
## 
## Call:
## lm(formula = log(bmi) ~ ., data = data_bmi)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.131845 -0.026047  0.000653  0.027572  0.107675 
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.7820756  0.0634046  28.106  < 2e-16 ***
## neck         0.0017372  0.0021370   0.813 0.417073    
## chest        0.0054462  0.0008533   6.382 8.98e-10 ***
## abdomen      0.0051190  0.0007087   7.223 6.76e-12 ***
## waist               NA         NA      NA       NA    
## hip          0.0004404  0.0011846   0.372 0.710402    
## thigh        0.0061100  0.0012427   4.917 1.64e-06 ***
## knee        -0.0076085  0.0021101  -3.606 0.000379 ***
## ankle        0.0020730  0.0020180   1.027 0.305333    
## bicep        0.0025240  0.0015799   1.598 0.111461    
## forearm      0.0033017  0.0019222   1.718 0.087150 .  
## wrist        0.0009228  0.0046880   0.197 0.844112    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04028 on 239 degrees of freedom
## Multiple R-squared:  0.9066, Adjusted R-squared:  0.9027 
## F-statistic:   232 on 10 and 239 DF,  p-value: < 2.2e-16
# Backward AIC selection from the full log(bmi) model, with the per-step log suppressed.
ln_bmi_step_back <- step(
  ln_bmi_lm,
  direction = "backward",
  trace = FALSE
)
summary(ln_bmi_step_back)
## 
## Call:
## lm(formula = log(bmi) ~ chest + abdomen + thigh + knee + bicep + 
##     forearm, data = data_bmi)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.129144 -0.024844  0.000147  0.028553  0.111637 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.8276641  0.0495199  36.908  < 2e-16 ***
## chest        0.0057533  0.0008218   7.001 2.47e-11 ***
## abdomen      0.0051792  0.0006497   7.972 6.10e-14 ***
## thigh        0.0064286  0.0009988   6.436 6.48e-10 ***
## knee        -0.0064618  0.0018831  -3.431 0.000705 ***
## bicep        0.0028200  0.0015464   1.824 0.069436 .  
## forearm      0.0039923  0.0018360   2.174 0.030638 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04015 on 243 degrees of freedom
## Multiple R-squared:  0.9056, Adjusted R-squared:  0.9033 
## F-statistic: 388.7 on 6 and 243 DF,  p-value: < 2.2e-16
# Residuals-vs-fitted and normal Q-Q plots for the selected log(bmi) model.
# ggfortify's autoplot() returns a ggplot-based object, so theme_bw() is actually
# applied; base plot() returns NULL, which is why the earlier call printed NULL.
# This also avoids leaving par(mfrow) modified for later plots.
autoplot(ln_bmi_step_back, which = 1:2) + theme_bw()
# Side-by-side coefficient table for the bmi and log(bmi) stepwise models,
# shown to 5 decimal places without confidence intervals.
sjPlot::tab_model(
  bmi_step_back,
  ln_bmi_step_back,
  digits = 5,
  show.ci = FALSE
)
  bmi log(bmi)
Predictors Estimates p Estimates p
(Intercept) -10.94257 <0.001 1.82766 <0.001
chest 0.16090 <0.001 0.00575 <0.001
abdomen 0.12726 <0.001 0.00518 <0.001
hip 0.05047 0.103
thigh 0.14983 <0.001 0.00643 <0.001
knee -0.23116 <0.001 -0.00646 0.001
forearm 0.11484 0.011 0.00399 0.031
bicep 0.00282 0.069
Observations 250 250
R2 / R2 adjusted 0.902 / 0.900 0.906 / 0.903
Final Fitted Model

$\widehat{\log(\text{BMI})} = 1.83 + 0.0058\,\text{chest} + 0.0052\,\text{abdomen} + 0.0064\,\text{thigh} - 0.0065\,\text{knee} + 0.0028\,\text{bicep} + 0.0040\,\text{forearm}$.

Conclusion

# Side-by-side coefficient table for the pct_bf and log(bmi) stepwise models,
# shown to 5 decimal places without confidence intervals.
sjPlot::tab_model(
  bf_step_back,
  ln_bmi_step_back,
  digits = 5,
  show.ci = FALSE
)
  pct bf log(bmi)
Predictors Estimates p Estimates p
(Intercept) 1.52703 0.818 1.82766 <0.001
neck -0.39650 0.076
chest -0.12810 0.156 0.00575 <0.001
abdomen 1.01805 <0.001 0.00518 <0.001
hip -0.28758 0.002
bicep 0.26094 0.086 0.00282 0.069
wrist -1.55084 0.001
thigh 0.00643 <0.001
knee -0.00646 0.001
forearm 0.00399 0.031
Observations 250 250
R2 / R2 adjusted 0.735 / 0.729 0.906 / 0.903

By comparing the two models, we can see that, using body measurements alone, it is easier to predict BMI than percentage body fat. From the models, the abdomen measurement appears to have the greatest influence on both body fat and BMI, and hence any increase should be treated with caution.

2.2 Weight

2.2.1 Defining the model with population parameters

Since body density is calculated from weight, height, neck and the other variables in our data set, it is better not to include it in our full model.

\[ Body Weight = \beta_0 + \beta_1age + \beta_2percentageofbodyfat + \beta_3height\\ + \beta_4neck + \beta_5chest + \beta_6abdomen + \beta_7waist + \beta_8hip + \beta_{9}thigh\\ + \beta_{10}knee + \beta_{11}ankle + \beta_{12}bicep + \beta_{13}forearm + \beta_{14}wrist + \epsilon \]

2.2.2 Check Assumptions

The residuals \(\epsilon_i\) are iid \(N(0,\sigma^2)\) and there is a linear relationship between y and x.

# Drop density, then regress weight on every remaining column.
# NOTE(review): `data` still carries the derived bmi column at this point (it
# shows up in the coefficient table), although bmi is computed from weight and
# height and is not part of the model stated in 2.2.1. Confirm whether bmi
# should be removed here as well.
dataqf1 <- within(data, rm(density))
weight_lm <- lm(weight ~ ., data = dataqf1)
summary(weight_lm)
## 
## Call:
## lm(formula = weight ~ ., data = dataqf1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.8622 -0.8906 -0.1431  0.7716  6.0540 
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.528e+02  3.701e+00 -95.334  < 2e-16 ***
## pct_bf      -5.562e-02  2.699e-02  -2.061  0.04038 *  
## age         -5.065e-04  1.352e-02  -0.037  0.97015    
## height       4.808e+00  1.047e-01  45.925  < 2e-16 ***
## neck         1.407e-01  9.742e-02   1.444  0.15006    
## chest        6.062e-02  4.536e-02   1.336  0.18269    
## abdomen      4.711e-02  4.505e-02   1.046  0.29677    
## waist               NA         NA      NA       NA    
## hip         -5.715e-02  6.253e-02  -0.914  0.36163    
## thigh        1.245e-01  6.056e-02   2.055  0.04096 *  
## knee         2.936e-01  1.010e-01   2.906  0.00401 ** 
## ankle       -1.031e-02  9.160e-02  -0.113  0.91044    
## bicep        1.198e-01  7.034e-02   1.703  0.08996 .  
## forearm     -3.882e-02  8.635e-02  -0.450  0.65346    
## wrist       -1.409e-02  2.268e-01  -0.062  0.95053    
## bmi          6.440e+00  1.980e-01  32.522  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.763 on 235 degrees of freedom
## Multiple R-squared:  0.996,  Adjusted R-squared:  0.9957 
## F-statistic:  4167 on 14 and 235 DF,  p-value: < 2.2e-16
# Diagnostic plots 1 and 2 (residuals vs fitted, normal Q-Q) for the full weight model.
autoplot(weight_lm, which = 1:2) + theme_bw()

In the plot above the residuals are above zero from the beginning, then they go below zero and end up again above zero for the end. This means the linearity assumption fails. We underestimate the weight variable at the start and the end and overestimate the weight at medium. To address this problem, I transform the weight to sqrt(weight).

# Refit with a square-root-transformed response. The column keeps the name
# `weight`, so `weight ~ .` below models the square root of the original weight.
dataqf <- dataqf1 %>%
  mutate(weight = weight^(1 / 2))
weight_lmqf <- lm(weight ~ ., data = dataqf)
summary(weight_lmqf)
## 
## Call:
## lm(formula = weight ~ ., data = dataqf)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.31815 -0.01805  0.00782  0.02789  0.09803 
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.637e+00  1.073e-01 -61.872  < 2e-16 ***
## pct_bf      -3.807e-05  7.823e-04  -0.049   0.9612    
## age          2.271e-04  3.920e-04   0.579   0.5629    
## height       1.810e-01  3.035e-03  59.648  < 2e-16 ***
## neck         5.860e-03  2.824e-03   2.075   0.0391 *  
## chest        1.839e-03  1.315e-03   1.399   0.1633    
## abdomen      1.154e-03  1.306e-03   0.884   0.3778    
## waist               NA         NA      NA       NA    
## hip         -4.131e-03  1.813e-03  -2.279   0.0236 *  
## thigh        6.982e-03  1.755e-03   3.977 9.28e-05 ***
## knee         1.180e-02  2.929e-03   4.027 7.61e-05 ***
## ankle        7.045e-04  2.655e-03   0.265   0.7910    
## bicep        4.494e-03  2.039e-03   2.204   0.0285 *  
## forearm      1.553e-03  2.503e-03   0.621   0.5354    
## wrist        3.707e-03  6.576e-03   0.564   0.5734    
## bmi          2.358e-01  5.740e-03  41.080  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0511 on 235 degrees of freedom
## Multiple R-squared:  0.9976, Adjusted R-squared:  0.9974 
## F-statistic:  6901 on 14 and 235 DF,  p-value: < 2.2e-16
# Diagnostic plots 1 and 2 (residuals vs fitted, normal Q-Q) for the transformed weight model.
autoplot(weight_lmqf, which = 1:2) + theme_bw()
The linearity looks much better now.
  • Homoskedasticity: In the scatterplot above, the spread looks reasonably constant.
  • Normality: In the QQ plot above, the tail is below the diagonal line and the first value is a significant outlier. However, we have quite a large sample size so we can rely on the central limit theorem to give us approximately valid inferences.
  • qtlcharts::iplotCorr(dataqf)

    It seems there is no direct linear relationship between age and weight. I will do further research to work out the appropriate model.

    ##### 2.2.3 Dropping and adding variables using the AIC starting from the full model

    # Intercept-only model: the starting point (and lower bound) of the forward search.
    M0 <- lm(weight ~ 1, data = dataqf)
    # Forward search grows M0 towards the full model; backward search prunes the full model.
    step.fwd.aic <- step(
      M0,
      scope = list(lower = M0, upper = weight_lmqf),
      direction = "forward",
      trace = FALSE
    )
    step.back.aic <- step(
      weight_lmqf,
      direction = "backward",
      trace = FALSE
    )
    summary(step.back.aic)
    ## 
    ## Call:
    ## lm(formula = weight ~ height + neck + chest + hip + thigh + knee + 
    ##     bicep + bmi, data = dataqf)
    ## 
    ## Residuals:
    ##      Min       1Q   Median       3Q      Max 
    ## -0.32442 -0.01876  0.00895  0.02952  0.10609 
    ## 
    ## Coefficients:
    ##              Estimate Std. Error t value Pr(>|t|)    
    ## (Intercept) -6.627483   0.093473 -70.903  < 2e-16 ***
    ## height       0.181653   0.002686  67.629  < 2e-16 ***
    ## neck         0.007072   0.002562   2.760 0.006221 ** 
    ## chest        0.002300   0.001224   1.879 0.061438 .  
    ## hip         -0.003906   0.001744  -2.240 0.025983 *  
    ## thigh        0.006039   0.001580   3.823 0.000168 ***
    ## knee         0.013245   0.002696   4.913 1.65e-06 ***
    ## bicep        0.004560   0.001898   2.402 0.017048 *  
    ## bmi          0.238754   0.005008  47.671  < 2e-16 ***
    ## ---
    ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    ## 
    ## Residual standard error: 0.05079 on 241 degrees of freedom
    ## Multiple R-squared:  0.9975, Adjusted R-squared:  0.9975 
    ## F-statistic: 1.222e+04 on 8 and 241 DF,  p-value: < 2.2e-16
    # Coefficient table of the forward-selected model, for comparison with the backward result.
    summary(step.fwd.aic)
    ## 
    ## Call:
    ## lm(formula = weight ~ hip + neck + height + bmi + knee + thigh + 
    ##     bicep + chest, data = dataqf)
    ## 
    ## Residuals:
    ##      Min       1Q   Median       3Q      Max 
    ## -0.32442 -0.01876  0.00895  0.02952  0.10609 
    ## 
    ## Coefficients:
    ##              Estimate Std. Error t value Pr(>|t|)    
    ## (Intercept) -6.627483   0.093473 -70.903  < 2e-16 ***
    ## hip         -0.003906   0.001744  -2.240 0.025983 *  
    ## neck         0.007072   0.002562   2.760 0.006221 ** 
    ## height       0.181653   0.002686  67.629  < 2e-16 ***
    ## bmi          0.238754   0.005008  47.671  < 2e-16 ***
    ## knee         0.013245   0.002696   4.913 1.65e-06 ***
    ## thigh        0.006039   0.001580   3.823 0.000168 ***
    ## bicep        0.004560   0.001898   2.402 0.017048 *  
    ## chest        0.002300   0.001224   1.879 0.061438 .  
    ## ---
    ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    ## 
    ## Residual standard error: 0.05079 on 241 degrees of freedom
    ## Multiple R-squared:  0.9975, Adjusted R-squared:  0.9975 
    ## F-statistic: 1.222e+04 on 8 and 241 DF,  p-value: < 2.2e-16
    Both forward and backward search using AIC give the same result.

    ##### 2.2.4 Fitted model for the model selected by the step-wise procedure.

    \[ \sqrt{Weight} = -6.627483 + 0.181653 \times height + 0.007072 \times neck + 0.002300 \times chest\\ - 0.003906 \times hip + 0.006039 \times thigh + 0.013245 \times knee\\ + 0.004560 \times bicep + 0.238754 \times bmi \]

    Looking at the \(R^2\) value (multiple R-squared) from the summary output, 99.75% of the variability of sqrt(weight) is explained by the regression on height, neck, chest, hip, thigh, knee, bicep and bmi.
    1. On average, holding the other variables constant, a 1 unit increase in height leads to a 0.182 unit increase in sqrt(weight).
    2. On average, holding the other variables constant, a 1 unit increase in neck leads to a 0.0071 unit increase in sqrt(weight).
    3. On average, holding the other variables constant, a 1 unit increase in chest leads to a 0.0023 unit increase in sqrt(weight).
    4. On average, holding the other variables constant, a 1 unit increase in hip leads to a 0.0039 unit decrease in sqrt(weight).
    5. On average, holding the other variables constant, a 1 unit increase in thigh leads to a 0.0060 unit increase in sqrt(weight).
    6. On average, holding the other variables constant, a 1 unit increase in knee leads to a 0.0132 unit increase in sqrt(weight).
    7. On average, holding the other variables constant, a 1 unit increase in bicep leads to a 0.0046 unit increase in sqrt(weight).
    8. On average, holding the other variables constant, a 1 unit increase in bmi leads to a 0.239 unit increase in sqrt(weight).

    2.3 Body Density

    2.3.1 Defining the model with population parameters

    \[ Density = \beta_0 + \beta_1age + \beta_2weight + \beta_3height + \beta_4neck\\ + \beta_5chest + \beta_6abdomen + \beta_7waist + \beta_8hip + \beta_9thigh + \beta_{10}knee\\ + \beta_{11}ankle + \beta_{12}bicep + \beta_{13}forearm + \beta_{14}wrist + \beta_{15}bmi + \epsilon \]

    # Pairwise correlations between all columns of `data`, drawn as a heatmap
    # with the coefficient printed in every cell.
    cor_matrix <- cor(data)
    # TRUE is spelled out because T is an ordinary variable that can be reassigned.
    # NOTE(review): na.rm does not appear to be a documented pheatmap() argument;
    # it is passed on through `...` — confirm whether it is needed.
    pheatmap(cor_matrix, display_numbers = TRUE, na.rm = TRUE)

    The matrix above shows the pairwise correlations between variables. Notably, Pct.BF has a correlation of -0.99 with Density, which means Pct.BF could be used to explain Density. Meanwhile, variables with similar properties are linked together, which could be useful for generating groups.

    2.3.2 Check Assumptions:

    The residuals \(\epsilon_i\) are iid \(N(0,\sigma^2)\) and there is a linear relationship between y and x.

    # Remove the second column so it is not offered as a predictor of density.
    data1 <- data[-2]
    # Full model (every remaining column) and intercept-only null model.
    M1 <- lm(density ~ ., data = data1)
    M0 <- lm(density ~ 1, data = data1)
    # Residuals-vs-fitted and normal Q-Q panels for the full model.
    autoplot(M1, which = 1:2) + theme_bw()

    # Coefficient table of the full density model, rounded to 3 decimals.
    # The element is named in full: `$coef` only worked through partial matching.
    round(summary(M1)$coefficients, 3)
    ##             Estimate Std. Error t value Pr(>|t|)
    ## (Intercept)    1.396      0.128  10.872    0.000
    ## age            0.000      0.000  -2.090    0.038
    ## weight         0.001      0.000   2.574    0.011
    ## height        -0.004      0.002  -2.218    0.028
    ## neck           0.001      0.001   1.671    0.096
    ## chest          0.000      0.000   1.712    0.088
    ## abdomen       -0.002      0.000  -9.673    0.000
    ## hip            0.001      0.000   1.935    0.054
    ## thigh         -0.001      0.000  -1.523    0.129
    ## knee           0.000      0.001  -0.255    0.799
    ## ankle         -0.001      0.001  -1.020    0.309
    ## bicep         -0.001      0.000  -1.494    0.136
    ## forearm       -0.001      0.000  -1.072    0.285
    ## wrist          0.005      0.001   3.680    0.000
    ## bmi           -0.007      0.003  -2.604    0.010
    # Forward search: start from the null model and add terms up to the full model.
    step.fwd.aic <- step(
      M0,
      scope = list(lower = M0, upper = M1),
      direction = "forward",
      trace = FALSE
    )
    summary(step.fwd.aic)
    ## 
    ## Call:
    ## lm(formula = density ~ waist + weight + wrist + bicep + ankle, 
    ##     data = data1)
    ## 
    ## Residuals:
    ##       Min        1Q    Median        3Q       Max 
    ## -0.019954 -0.007519  0.000622  0.006554  0.035375 
    ## 
    ## Coefficients:
    ##               Estimate Std. Error t value Pr(>|t|)    
    ## (Intercept)  1.179e+00  1.773e-02  66.482  < 2e-16 ***
    ## waist       -5.776e-03  3.362e-04 -17.177  < 2e-16 ***
    ## weight       3.354e-04  6.879e-05   4.876 1.96e-06 ***
    ## wrist        4.006e-03  1.036e-03   3.866 0.000142 ***
    ## bicep       -8.700e-04  3.533e-04  -2.463 0.014485 *  
    ## ankle       -7.717e-04  4.978e-04  -1.550 0.122346    
    ## ---
    ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    ## 
    ## Residual standard error: 0.009974 on 244 degrees of freedom
    ## Multiple R-squared:  0.7264, Adjusted R-squared:  0.7208 
    ## F-statistic: 129.5 on 5 and 244 DF,  p-value: < 2.2e-16
    # Backward search: start from the full model and drop terms down to the null model.
    step.back.aic <- step(
      M1,
      scope = list(lower = M0, upper = M1),
      direction = "backward",
      trace = FALSE
    )
    summary(step.back.aic)
    ## 
    ## Call:
    ## lm(formula = density ~ age + weight + height + neck + chest + 
    ##     abdomen + hip + thigh + bicep + wrist + bmi, data = data1)
    ## 
    ## Residuals:
    ##       Min        1Q    Median        3Q       Max 
    ## -0.021592 -0.007103  0.000582  0.006697  0.032460 
    ## 
    ## Coefficients:
    ##               Estimate Std. Error t value Pr(>|t|)    
    ## (Intercept)  1.3918103  0.1265874  10.995  < 2e-16 ***
    ## age         -0.0001478  0.0000715  -2.067 0.039857 *  
    ## weight       0.0009084  0.0003521   2.580 0.010477 *  
    ## height      -0.0042403  0.0018064  -2.347 0.019724 *  
    ## neck         0.0009150  0.0005263   1.739 0.083389 .  
    ## chest        0.0004526  0.0002509   1.804 0.072534 .  
    ## abdomen     -0.0020091  0.0002101  -9.565  < 2e-16 ***
    ## hip          0.0007099  0.0003423   2.074 0.039138 *  
    ## thigh       -0.0005467  0.0003244  -1.685 0.093286 .  
    ## bicep       -0.0006743  0.0003751  -1.798 0.073508 .  
    ## wrist        0.0039956  0.0011742   3.403 0.000783 ***
    ## bmi         -0.0070109  0.0025064  -2.797 0.005577 ** 
    ## ---
    ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    ## 
    ## Residual standard error: 0.009778 on 238 degrees of freedom
    ## Multiple R-squared:  0.7435, Adjusted R-squared:  0.7316 
    ## F-statistic: 62.71 on 11 and 238 DF,  p-value: < 2.2e-16
    # Exhaustive subset search over the predictors of density, with up to 15 variables per model.
    # NOTE(review): the XHAUST warning below presumably comes from linearly dependent
    # predictors (waist gets an NA coefficient in the other full models of this report) —
    # confirm, and consider removing the redundant column before the search.
    exh <- regsubsets(density~., data = data1, nvmax = 15)
    ## Warning in leaps.exhaustive(a, really.big): XHAUST returned error code -999
    # Plot the candidate models on the BIC scale.
    plot(exh,scale="bic")

    It is best to choose age, weight, height, neck and chest for conducting the relationship analysis with body density.

    2.3.3 Dropping variables using the AIC starting from the full model
    Hypothesis: \[ H_0: \alpha_1 = \alpha_2 = ... = \alpha_g \\ H_1: \text{ Not all means are the same.}\\ \] Assumptions: >>>>>>> 62e0a8412e27ea0e3d0247c40f04dfa9ef0c7a0d
    1. Linearity: In the scatterplot above, .
    2. Homoskedasticity: In the scatterplot above, .
    3. Normality: In the QQ plot above, .
    # Candidate model: density on age, weight, height, neck, chest and abdomen.
    M2 <- lm(
      density ~ age + weight + height + neck + chest + abdomen,
      data = data1
    )
    summary(M2)
    ## 
    ## Call:
    ## lm(formula = density ~ age + weight + height + neck + chest + 
    ##     abdomen, data = data1)
    ## 
    ## Residuals:
    ##       Min        1Q    Median        3Q       Max 
    ## -0.023346 -0.007573 -0.000392  0.006861  0.040282 
    ## 
    ## Coefficients:
    ##               Estimate Std. Error t value Pr(>|t|)    
    ## (Intercept)  1.103e+00  3.748e-02  29.420   <2e-16 ***
    ## age         -1.484e-05  6.341e-05  -0.234   0.8152    
    ## weight       7.889e-05  1.074e-04   0.735   0.4633    
    ## height       8.984e-04  4.061e-04   2.212   0.0279 *  
    ## neck         1.056e-03  5.054e-04   2.089   0.0378 *  
    ## chest        3.303e-04  2.397e-04   1.378   0.1695    
    ## abdomen     -2.130e-03  1.976e-04 -10.779   <2e-16 ***
    ## ---
    ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    ## 
    ## Residual standard error: 0.0102 on 243 degrees of freedom
    ## Multiple R-squared:  0.7151, Adjusted R-squared:  0.7081 
    ## F-statistic: 101.7 on 6 and 243 DF,  p-value: < 2.2e-16
    # Reduced model: M2 without age and weight.
    M3 <- lm(
      density ~ height + neck + chest + abdomen,
      data = data1
    )
    summary(M3)
    ## 
    ## Call:
    ## lm(formula = density ~ height + neck + chest + abdomen, data = data1)
    ## 
    ## Residuals:
    ##       Min        1Q    Median        3Q       Max 
    ## -0.023246 -0.007717 -0.000356  0.006985  0.039598 
    ## 
    ## Coefficients:
    ##               Estimate Std. Error t value Pr(>|t|)    
    ## (Intercept)  1.0746853  0.0184258  58.325  < 2e-16 ***
    ## height       0.0011729  0.0002615   4.485 1.12e-05 ***
    ## neck         0.0012053  0.0004577   2.634  0.00898 ** 
    ## chest        0.0004285  0.0002058   2.082  0.03834 *  
    ## abdomen     -0.0020607  0.0001537 -13.403  < 2e-16 ***
    ## ---
    ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    ## 
    ## Residual standard error: 0.01018 on 245 degrees of freedom
    ## Multiple R-squared:  0.7139, Adjusted R-squared:  0.7093 
    ## F-statistic: 152.9 on 4 and 245 DF,  p-value: < 2.2e-16
    # Relative weights of the predictors in a fitted linear model.
    #
    # Splits the model's R-squared among the predictors using the correlation
    # matrix of the model frame, draws a dot chart of the shares and returns them.
    #
    # Arguments:
    #   fit  A fitted model whose `$model` component is an all-numeric model
    #        frame with the response in the first column (e.g. from lm()).
    #   ...  Further arguments forwarded to dotchart().
    #
    # Returns a one-column data frame (`Weights`, in % of R-squared) with one row
    # per predictor, sorted in increasing order.
    relweights <- function(fit, ...) {
      R <- cor(fit$model)
      nvar <- ncol(R)
      # Predictor/predictor and predictor/response correlations; drop = FALSE
      # keeps rxx a matrix even when there is a single predictor.
      rxx <- R[2:nvar, 2:nvar, drop = FALSE]
      rxy <- R[2:nvar, 1]
      # Symmetric square root of rxx, built from its eigen-decomposition.
      eig <- eigen(rxx)
      evec <- eig$vectors
      ev <- eig$values
      # nrow is given explicitly so a length-one `ev` is not read as a matrix size.
      delta <- diag(sqrt(ev), nrow = length(ev))
      lambda <- evec %*% delta %*% t(evec)
      lambdasq <- lambda^2
      beta <- solve(lambda) %*% rxy
      rsquare <- colSums(beta^2)
      rawwgt <- lambdasq %*% beta^2
      import <- as.data.frame((rawwgt / rsquare) * 100)
      row.names(import) <- names(fit$model[2:nvar])
      names(import) <- "Weights"
      # Order by the numeric column itself: order() on a whole data frame goes
      # through xtfrm(), which newer R versions warn about.
      import <- import[order(import$Weights), 1, drop = FALSE]
      dotchart(import$Weights, labels = row.names(import),
               xlab = "% of R-Square", pch = 19,
               main = "Relative Importance of Predictor Variables",
               sub = paste("Total R-Square=", round(rsquare, digits = 3)),
               ...)
      import
    }
    # Relative importance of the predictors in M3; col is forwarded to dotchart().
    relweights(M3, col="blue")

    ##           Weights
    ## height   2.905349
    ## neck    12.624373
    ## chest   29.711934
    ## abdomen 54.758344
    <<<<<<< HEAD
    2.1.3 Dropping variables using the AIC starting from the full model
    # Backward elimination from the full pct_bf model; the trace of each step is printed below.
    pbf_step <- step(pbf_lm, direction = "backward")
    ## Start:  AIC=137.29
    ## pct_bf ~ density + age + weight + height + neck + chest + abdomen + 
    ##     waist + hip + thigh + knee + ankle + bicep + forearm + wrist + 
    ##     bmi
    ## 
    ## 
    ## Step:  AIC=137.29
    ## pct_bf ~ density + age + weight + height + neck + chest + abdomen + 
    ##     hip + thigh + knee + ankle + bicep + forearm + wrist + bmi
    ## 
    ##           Df Sum of Sq    RSS    AIC
    ## - wrist    1       0.0  380.9 135.29
    ## - neck     1       0.1  381.0 135.34
    ## - forearm  1       0.2  381.1 135.40
    ## - knee     1       0.2  381.2 135.44
    ## - thigh    1       0.5  381.4 135.60
    ## - abdomen  1       0.6  381.6 135.70
    ## - chest    1       0.8  381.8 135.84
    ## - hip      1       1.1  382.1 136.03
    ## - bicep    1       1.8  382.8 136.49
    ## - bmi      1       1.8  382.8 136.49
    ## - height   1       2.0  382.9 136.57
    ## - ankle    1       2.1  383.0 136.67
    ## - weight   1       2.7  383.6 137.04
    ## - age      1       2.9  383.8 137.18
    ## <none>                  380.9 137.29
    ## - density  1    3809.8 4190.7 734.79
    ## 
    ## Step:  AIC=135.29
    ## pct_bf ~ density + age + weight + height + neck + chest + abdomen + 
    ##     hip + thigh + knee + ankle + bicep + forearm + bmi
    ## 
    ##           Df Sum of Sq    RSS    AIC
    ## - neck     1       0.1  381.0 133.35
    ## - forearm  1       0.2  381.1 133.40
    ## - knee     1       0.2  381.2 133.45
    ## - thigh    1       0.5  381.4 133.60
    ## - abdomen  1       0.6  381.6 133.70
    ## - chest    1       0.8  381.8 133.84
    ## - hip      1       1.1  382.1 134.03
    ## - bicep    1       1.8  382.8 134.50
    ## - bmi      1       1.9  382.8 134.51
    ## - height   1       2.0  382.9 134.58
    ## - ankle    1       2.2  383.2 134.76
    ## - weight   1       2.7  383.6 135.05
    ## <none>                  380.9 135.29
    ## - age      1       3.3  384.2 135.45
    ## - density  1    4030.1 4411.1 745.60
    ## 
    ## Step:  AIC=133.35
    ## pct_bf ~ density + age + weight + height + chest + abdomen + 
    ##     hip + thigh + knee + ankle + bicep + forearm + bmi
    ## 
    ##           Df Sum of Sq    RSS    AIC
    ## - forearm  1       0.1  381.1 131.43
    ## - knee     1       0.2  381.2 131.49
    ## - thigh    1       0.5  381.5 131.69
    ## - abdomen  1       0.6  381.6 131.74
    ## - chest    1       0.9  381.9 131.90
    ## - hip      1       1.3  382.3 132.22
    ## - bmi      1       1.9  382.9 132.59
    ## - bicep    1       1.9  382.9 132.61
    ## - height   1       2.0  383.0 132.64
    ## - ankle    1       2.2  383.2 132.79
    ## - weight   1       2.6  383.6 133.06
    ## <none>                  381.0 133.35
    ## - age      1       3.2  384.2 133.46
    ## - density  1    4170.5 4551.5 751.44
    ## 
    ## Step:  AIC=131.43
    ## pct_bf ~ density + age + weight + height + chest + abdomen + 
    ##     hip + thigh + knee + ankle + bicep + bmi
    ## 
    ##           Df Sum of Sq    RSS    AIC
    ## - knee     1       0.2  381.3 129.56
    ## - abdomen  1       0.5  381.7 129.78
    ## - thigh    1       0.5  381.7 129.78
    ## - chest    1       0.8  382.0 129.98
    ## - hip      1       1.3  382.4 130.25
    ## - bicep    1       1.8  382.9 130.61
    ## - bmi      1       1.8  383.0 130.61
    ## - height   1       1.9  383.1 130.68
    ## - ankle    1       2.2  383.4 130.88
    ## - weight   1       2.6  383.8 131.13
    ## <none>                  381.1 131.43
    ## - age      1       3.2  384.3 131.51
    ## - density  1    4170.8 4551.9 749.46
    ## 
    ## Step:  AIC=129.56
    ## pct_bf ~ density + age + weight + height + chest + abdomen + 
    ##     hip + thigh + ankle + bicep + bmi
    ## 
    ##           Df Sum of Sq    RSS    AIC
    ## - abdomen  1       0.5  381.9 127.92
    ## - thigh    1       0.8  382.1 128.06
    ## - chest    1       0.9  382.2 128.13
    ## - hip      1       1.2  382.5 128.31
    ## - bmi      1       1.7  383.0 128.66
    ## - bicep    1       1.8  383.1 128.71
    ## - height   1       1.8  383.2 128.75
    ## - weight   1       2.4  383.8 129.15
    ## - ankle    1       2.7  384.0 129.29
    ## - age      1       3.0  384.3 129.51
    ## <none>                  381.3 129.56
    ## - density  1    4170.8 4552.1 747.47
    ## 
    ## Step:  AIC=127.92
    ## pct_bf ~ density + age + weight + height + chest + hip + thigh + 
    ##     ankle + bicep + bmi
    ## 
    ##           Df Sum of Sq    RSS    AIC
    ## - thigh    1       0.8  382.7 126.43
    ## - chest    1       1.3  383.2 126.79
    ## - bmi      1       1.6  383.5 126.96
    ## - hip      1       1.6  383.5 126.98
    ## - height   1       1.9  383.7 127.13
    ## - bicep    1       2.3  384.2 127.44
    ## - weight   1       2.7  384.6 127.66
    ## <none>                  381.9 127.92
    ## - ankle    1       3.1  385.0 127.96
    ## - age      1       4.5  386.4 128.85
    ## - density  1    6136.5 6518.4 835.23
    ## 
    ## Step:  AIC=126.43
    ## pct_bf ~ density + age + weight + height + chest + hip + ankle + 
    ##     bicep + bmi
    ## 
    ##           Df Sum of Sq    RSS    AIC
    ## - hip      1       1.0  383.7 125.10
    ## - bmi      1       1.4  384.0 125.32
    ## - height   1       1.5  384.2 125.41
    ## - chest    1       2.0  384.6 125.71
    ## - weight   1       2.2  384.9 125.86
    ## <none>                  382.7 126.43
    ## - ankle    1       3.2  385.9 126.50
    ## - bicep    1       3.2  385.9 126.51
    ## - age      1       6.9  389.6 128.90
    ## - density  1    6219.1 6601.8 836.41
    ## 
    ## Step:  AIC=125.1
    ## pct_bf ~ density + age + weight + height + chest + ankle + bicep + 
    ##     bmi
    ## 
    ##           Df Sum of Sq    RSS    AIC
    ## - bmi      1       0.9  384.6 123.71
    ## - height   1       1.2  384.9 123.87
    ## - chest    1       1.4  385.1 124.00
    ## - weight   1       2.3  386.0 124.60
    ## <none>                  383.7 125.10
    ## - ankle    1       3.5  387.2 125.39
    ## - bicep    1       3.7  387.4 125.50
    ## - age      1       6.3  390.0 127.14
    ## - density  1    6294.6 6678.3 837.29
    ## 
    ## Step:  AIC=123.71
    ## pct_bf ~ density + age + weight + height + chest + ankle + bicep
    ## 
    ##           Df Sum of Sq    RSS    AIC
    ## - height   1       0.4  385.1 122.00
    ## - chest    1       1.1  385.7 122.41
    ## <none>                  384.6 123.71
    ## - bicep    1       3.5  388.2 123.99
    ## - ankle    1       3.8  388.4 124.14
    ## - weight   1       5.0  389.7 124.96
    ## - age      1       6.8  391.4 126.09
    ## - density  1    6598.1 6982.8 846.43
    ## 
    ## Step:  AIC=122
    ## pct_bf ~ density + age + weight + chest + ankle + bicep
    ## 
    ##           Df Sum of Sq    RSS    AIC
    ## - chest    1       2.5  387.5 121.59
    ## <none>                  385.1 122.00
    ## - bicep    1       3.2  388.3 122.06
    ## - ankle    1       3.7  388.8 122.42
    ## - weight   1       6.0  391.1 123.86
    ## - age      1       6.6  391.7 124.27
    ## - density  1    7697.2 8082.3 880.99
    ## 
    ## Step:  AIC=121.59
    ## pct_bf ~ density + age + weight + ankle + bicep
    ## 
    ##           Df Sum of Sq    RSS    AIC
    ## - bicep    1       3.0  390.5 121.49
    ## <none>                  387.5 121.59
    ## - ankle    1       4.6  392.2 122.55
    ## - age      1      10.8  398.3 126.44
    ## - weight   1      27.6  415.1 136.77
    ## - density  1    8436.8 8824.3 900.95
    ## 
    ## Step:  AIC=121.49
    ## pct_bf ~ density + age + weight + ankle
    ## 
    ##           Df Sum of Sq    RSS    AIC
    ## <none>                  390.5 121.49
    ## - ankle    1       4.5  395.0 122.34
    ## - age      1      11.5  402.0 126.74
    ## - weight   1      29.0  419.5 137.41
    ## - density  1    8435.0 8825.5 898.99

    Backwards selection using the AIC dropped all variables except for density, age, weight and ankle, which are kept in the model.

    2.1.4 Fitted model for the model selected by the step-wise procedure.

    \[ Percentage of Body Fat = 456.5 - 416.1 \times density\\ + 0.01827 \times age + 0.01927 \times weight\\ - 0.1015 \times ankle \] Looking at the \(R^2\) value (multiple R-squared) from the summary output, 97.7% of the variability of percentage of body fat is explained by the regression on density, age, weight and ankle circumference.

    1. On average, holding the other variables constant, a 1 \(gm/cm^3\) increase in density leads to a 416.1 unit decrease in percentage of body fat.
    2. On average, holding the other variables constant, a one year increase in age leads to a 0.0183 unit increase in percentage of body fat.
    3. On average, holding the other variables constant, a 1 unit increase in weight leads to a 0.0193 unit increase in percentage of body fat.
    4. On average, holding the other variables constant, a 1 unit increase in ankle circumference leads to a 0.1015 unit decrease in percentage of body fat.
    #options("scipen"=100, "digits"=4)
    # Coefficients and fit statistics of the model kept by the backward AIC search.
    summary(pbf_step)
    ## 
    ## Call:
    ## lm(formula = pct_bf ~ density + age + weight + ankle, data = data)
    ## 
    ## Residuals:
    ##     Min      1Q  Median      3Q     Max 
    ## -8.4361 -0.3914 -0.1079  0.2276 15.3411 
    ## 
    ## Coefficients:
    ##               Estimate Std. Error t value Pr(>|t|)    
    ## (Intercept)  4.565e+02  6.562e+00  69.561  < 2e-16 ***
    ## density     -4.161e+02  5.720e+00 -72.748  < 2e-16 ***
    ## age          1.827e-02  6.803e-03   2.685  0.00774 ** 
    ## weight       1.927e-02  4.517e-03   4.267 2.83e-05 ***
    ## ankle       -1.015e-01  6.047e-02  -1.678  0.09458 .  
    ## ---
    ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    ## 
    ## Residual standard error: 1.262 on 245 degrees of freedom
    ## Multiple R-squared:  0.9772, Adjusted R-squared:  0.9768 
    ## F-statistic:  2625 on 4 and 245 DF,  p-value: < 2.2e-16
    #options("scipen"=-100, "digits"=4)
    2.1.5 Linear regression assumptions for the stepwise model - why do this? same as previous??
    # Residuals-vs-fitted and normal Q-Q panels (which = 1:2) for the stepwise pct_bf model.
    autoplot(pbf_step, which = 1:2) + theme_bw()

    2.2 Prediction for Obesity

    Due to the increasing consumption of fast food and the increasing convenience of food deliveries, concern about obesity is rising throughout the world and has reached a new high. This increasing concern has led to an increasing need to measure obesity accurately, and percentage body fat is arguably the most accurate measure by far. However, the calculation of body fat is difficult and many have switched to Body Mass Index (BMI) for simpler calculation. This section compares the results of predicting body fat percentage from other body measurements with those of predicting BMI from other body measurements, to determine which body measurement is the most important in determining obesity.

    Body Fat Percentage

    Data Visualisation
    # Interactive correlation matrix of the columns of data_bf.
    qtlcharts::iplotCorr(data_bf)
    ## Set screen size to height=700 x width=1000

    Based on the interactive correlation matrix, it can be seen the level of correlation differs quite drastically between the variables and the backward variable selection method is adopted.

    =======
    2.3.4 Fitted model for the model selected by the step-wise procedure.

    \[ Body Density = 1.0746853 + 0.0011729 \times Height\\ + 0.0012053 \times Neck + 0.0004285 \times Chest\\ - 0.0020607 \times Abdomen\\ \]

    Looking at the \(R^2\) value (multiple R-squared) from the summary output, 71.39% of the variability of density is explained by the regression on Height, Neck, Chest and Abdomen.
    1. On average, holding the other variables constant, a 1 unit increase in Height leads to a 0.0012 unit increase in Density.
    2. On average, holding the other variables constant, a 1 unit increase in Neck leads to a 0.0012 unit increase in Density.
    3. On average, holding the other variables constant, a 1 unit increase in Chest leads to a 0.0004 unit increase in Density.
    4. On average, holding the other variables constant, a 1 unit increase in Abdomen leads to a 0.0021 unit decrease in Density.
    2.3.5 Linear regression assumptions for the stepwise model - why do this? same as previous??
    # Residuals-vs-fitted and normal Q-Q panels (which = 1:2) for M3.
    autoplot(M3,which=1:2)+theme_bw()

    Hypothesis: \[ H_0: \alpha_1 = \alpha_2 = ... = \alpha_g \\ H_1: \text{ Not all means are the same.}\\ \] Assumptions:
    1. \(\epsilon_{ijk} \sim N(0, \sigma^2)\)
    >>>>>>> 62e0a8412e27ea0e3d0247c40f04dfa9ef0c7a0d
    Multiple Regression
    # Full model: percentage body fat regressed on every other column of data_bf.
    bf_lm <- lm(pct_bf ~ ., data = data_bf)
    summary(bf_lm)
    ## 
    ## Call:
    ## lm(formula = pct_bf ~ ., data = data_bf)
    ## 
    ## Residuals:
    ##     Min      1Q  Median      3Q     Max 
    ## -9.8684 -2.9088 -0.1904  3.0491 11.1421 
    ## 
    ## Coefficients: (1 not defined because of singularities)
    ##             Estimate Std. Error t value Pr(>|t|)    
    ## (Intercept)  2.20340    6.83392   0.322  0.74742    
    ## neck        -0.45612    0.23034  -1.980  0.04882 *  
    ## chest       -0.13005    0.09197  -1.414  0.15866    
    ## abdomen      1.03299    0.07638  13.524  < 2e-16 ***
    ## waist             NA         NA      NA       NA    
    ## hip         -0.33000    0.12768  -2.585  0.01034 *  
    ## thigh        0.08793    0.13395   0.656  0.51217    
    ## knee        -0.13537    0.22744  -0.595  0.55227    
    ## ankle        0.05505    0.21751   0.253  0.80041    
    ## bicep        0.17762    0.17029   1.043  0.29798    
    ## forearm      0.19468    0.20718   0.940  0.34834    
    ## wrist       -1.52499    0.50529  -3.018  0.00282 ** 
    ## ---
    ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    ## 
    ## Residual standard error: 4.341 on 239 degrees of freedom
    ## Multiple R-squared:  0.737,  Adjusted R-squared:  0.726 
    ## F-statistic: 66.98 on 10 and 239 DF,  p-value: < 2.2e-16

    Using the individual p-value method, the variables that would be dropped are chest, waist, thigh, knee, ankle, bicep and forearm, with ankle being the first to be dropped because it has the highest p-value. However, to double check, the AIC criterion will also be considered.

    # Backward elimination from the full body-fat model, with the step trace suppressed.
    bf_step_back <- step(
      bf_lm,
      direction = "backward",
      trace = FALSE
    )
    summary(bf_step_back)
    ## 
    ## Call:
    ## lm(formula = pct_bf ~ neck + chest + abdomen + hip + bicep + 
    ##     wrist, data = data_bf)
    ## 
    ## Residuals:
    ##    Min     1Q Median     3Q    Max 
    ## -9.668 -2.889 -0.361  3.210 11.148 
    ## 
    ## Coefficients:
    ##             Estimate Std. Error t value Pr(>|t|)    
    ## (Intercept)  1.52703    6.63727   0.230 0.818232    
    ## neck        -0.39650    0.22234  -1.783 0.075783 .  
    ## chest       -0.12810    0.08992  -1.425 0.155562    
    ## abdomen      1.01805    0.07431  13.700  < 2e-16 ***
    ## hip         -0.28758    0.09232  -3.115 0.002060 ** 
    ## bicep        0.26094    0.15160   1.721 0.086469 .  
    ## wrist       -1.55084    0.45510  -3.408 0.000767 ***
    ## ---
    ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    ## 
    ## Residual standard error: 4.32 on 243 degrees of freedom
    ## Multiple R-squared:  0.7353, Adjusted R-squared:  0.7287 
    ## F-statistic: 112.5 on 6 and 243 DF,  p-value: < 2.2e-16

    Based on the backward selection model, the fitted model has become:

    $ \widehat{\text{pct\_bf}} = 1.527 - 0.3965neck - 0.128chest + 1.01805abdomen - 0.28758hip + 0.26bicep - 1.55084wrist $

    Finally, to check assumption, we perform the ggfortify function.

    # Residuals-vs-fitted and normal Q-Q panels drawn with ggfortify's autoplot(),
    # like the other diagnostics in this report. Base plot() returns NULL, so
    # adding theme_bw() to it had no effect and only printed NULL.
    autoplot(bf_step_back, which = 1:2) + theme_bw()

    The QQ plot shows a straight line which indicates that the normality assumption is reasonable. However, the residuals vs fitted plot shows a slight variation; but given that body fat is hard to predict, this is acceptable.

    <<<<<<< HEAD
    Final fitted model

    $ \widehat{\text{pct\_bf}} = 1.527 - 0.3965neck - 0.128chest + 1.01805abdomen - 0.28758hip + 0.26bicep - 1.55084wrist $

    BMI

    For this analysis, the formula of BMI is \(BMI = \frac{Weight (lbs)*703}{Height(in)^2}\)

    Data Visualisation
    # Interactive correlation matrix of the columns of data_bmi.
    qtlcharts::iplotCorr(data_bmi)

    Based on the interactive correlation matrix, it can be seen the level of correlation differs quite drastically between the variables and the backward variable selection method is adopted.

    Multiple Regression
    # Full model: BMI regressed on every other column of data_bmi.
    bmi_lm <- lm(bmi ~ ., data = data_bmi)
    summary(bmi_lm)
    ## 
    ## Call:
    ## lm(formula = bmi ~ ., data = data_bmi)
    ## 
    ## Residuals:
    ##     Min      1Q  Median      3Q     Max 
    ## -3.1538 -0.6529  0.0036  0.6464  3.7589 
    ## 
    ## Coefficients: (1 not defined because of singularities)
    ##               Estimate Std. Error t value Pr(>|t|)    
    ## (Intercept) -11.337205   1.667286  -6.800 8.32e-11 ***
    ## neck          0.031220   0.056196   0.556    0.579    
    ## chest         0.148829   0.022439   6.633 2.18e-10 ***
    ## abdomen       0.130813   0.018636   7.020 2.29e-11 ***
    ## waist               NA         NA      NA       NA    
    ## hip           0.048917   0.031149   1.570    0.118    
    ## thigh         0.135537   0.032679   4.147 4.67e-05 ***
    ## knee         -0.253557   0.055488  -4.570 7.84e-06 ***
    ## ankle         0.056067   0.053066   1.057    0.292    
    ## bicep         0.051276   0.041545   1.234    0.218    
    ## forearm       0.076917   0.050545   1.522    0.129    
    ## wrist         0.005644   0.123276   0.046    0.964    
    ## ---
    ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    ## 
    ## Residual standard error: 1.059 on 239 degrees of freedom
    ## Multiple R-squared:  0.9039, Adjusted R-squared:  0.8998 
    ## F-statistic: 224.7 on 10 and 239 DF,  p-value: < 2.2e-16

    Using the individual p-value method, the variables that need to be dropped are hip, ankle, bicep, forearm and wrist. To double check, the AIC criterion will also be considered.

    # Backward elimination from the full BMI model; trace = FALSE suppresses
    # the step-by-step log.
    bmi_step_back <- step(bmi_lm, direction = "backward", trace = FALSE)
    summary(bmi_step_back)
    ## 
    ## Call:
    ## lm(formula = bmi ~ chest + abdomen + hip + thigh + knee + forearm, 
    ##     data = data_bmi)
    ## 
    ## Residuals:
    ##     Min      1Q  Median      3Q     Max 
    ## -3.1197 -0.6944 -0.0274  0.6831  3.8464 
    ## 
    ## Coefficients:
    ##              Estimate Std. Error t value Pr(>|t|)    
    ## (Intercept) -10.94257    1.43829  -7.608 6.10e-13 ***
    ## chest         0.16090    0.02122   7.582 7.18e-13 ***
    ## abdomen       0.12726    0.01826   6.968 3.01e-11 ***
    ## hip           0.05047    0.03084   1.637   0.1030    
    ## thigh         0.14983    0.03032   4.942 1.44e-06 ***
    ## knee         -0.23116    0.05148  -4.490 1.10e-05 ***
    ## forearm       0.11484    0.04468   2.571   0.0108 *  
    ## ---
    ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    ## 
    ## Residual standard error: 1.058 on 243 degrees of freedom
    ## Multiple R-squared:  0.9024, Adjusted R-squared:    0.9 
    ## F-statistic: 374.6 on 6 and 243 DF,  p-value: < 2.2e-16

    2.3 Body Density

    Based on the backward selection model, the fitted model has become:

    $\widehat{\text{BMI}} = -10.94 + 0.161\,\text{chest} + 0.127\,\text{abdomen} + 0.050\,\text{hip} + 0.150\,\text{thigh} - 0.231\,\text{knee} + 0.115\,\text{forearm}$

    Finally, to check the model assumptions, we examine the residuals vs fitted plot and the normal QQ plot.

    # Residuals-vs-fitted and normal QQ plots for the selected BMI model.
    # Base plot() returns NULL, so adding theme_bw() to it had no effect and
    # printed "NULL"; autoplot() returns a ggplot object that the theme applies to.
    autoplot(bmi_step_back, which = 1:2) + theme_bw()

    The QQ plot shows a straight line which indicates that the normality assumption is reasonable. However, the residuals vs fitted plot shows a fan shaped plot which indicates that the assumption of homogeneous variance is violated. We can use a log transformed response and re-fit the linear regression.

    # Refit the full model with a log-transformed response.
    ln_bmi_lm <- lm(log(bmi) ~ ., data = data_bmi)
    summary(ln_bmi_lm)
    ## 
    ## Call:
    ## lm(formula = log(bmi) ~ ., data = data_bmi)
    ## 
    ## Residuals:
    ##       Min        1Q    Median        3Q       Max 
    ## -0.131845 -0.026047  0.000653  0.027572  0.107675 
    ## 
    ## Coefficients: (1 not defined because of singularities)
    ##               Estimate Std. Error t value Pr(>|t|)    
    ## (Intercept)  1.7820756  0.0634046  28.106  < 2e-16 ***
    ## neck         0.0017372  0.0021370   0.813 0.417073    
    ## chest        0.0054462  0.0008533   6.382 8.98e-10 ***
    ## abdomen      0.0051190  0.0007087   7.223 6.76e-12 ***
    ## waist               NA         NA      NA       NA    
    ## hip          0.0004404  0.0011846   0.372 0.710402    
    ## thigh        0.0061100  0.0012427   4.917 1.64e-06 ***
    ## knee        -0.0076085  0.0021101  -3.606 0.000379 ***
    ## ankle        0.0020730  0.0020180   1.027 0.305333    
    ## bicep        0.0025240  0.0015799   1.598 0.111461    
    ## forearm      0.0033017  0.0019222   1.718 0.087150 .  
    ## wrist        0.0009228  0.0046880   0.197 0.844112    
    ## ---
    ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    ## 
    ## Residual standard error: 0.04028 on 239 degrees of freedom
    ## Multiple R-squared:  0.9066, Adjusted R-squared:  0.9027 
    ## F-statistic:   232 on 10 and 239 DF,  p-value: < 2.2e-16
    # Backward elimination from the full log(bmi) model, without the step log.
    ln_bmi_step_back <- step(ln_bmi_lm, direction = "backward", trace = FALSE)
    summary(ln_bmi_step_back)
    ## 
    ## Call:
    ## lm(formula = log(bmi) ~ chest + abdomen + thigh + knee + bicep + 
    ##     forearm, data = data_bmi)
    ## 
    ## Residuals:
    ##       Min        1Q    Median        3Q       Max 
    ## -0.129144 -0.024844  0.000147  0.028553  0.111637 
    ## 
    ## Coefficients:
    ##               Estimate Std. Error t value Pr(>|t|)    
    ## (Intercept)  1.8276641  0.0495199  36.908  < 2e-16 ***
    ## chest        0.0057533  0.0008218   7.001 2.47e-11 ***
    ## abdomen      0.0051792  0.0006497   7.972 6.10e-14 ***
    ## thigh        0.0064286  0.0009988   6.436 6.48e-10 ***
    ## knee        -0.0064618  0.0018831  -3.431 0.000705 ***
    ## bicep        0.0028200  0.0015464   1.824 0.069436 .  
    ## forearm      0.0039923  0.0018360   2.174 0.030638 *  
    ## ---
    ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    ## 
    ## Residual standard error: 0.04015 on 243 degrees of freedom
    ## Multiple R-squared:  0.9056, Adjusted R-squared:  0.9033 
    ## F-statistic: 388.7 on 6 and 243 DF,  p-value: < 2.2e-16
    # Residuals-vs-fitted and normal QQ plots for the selected log(bmi) model.
    # Base plot() returns NULL, so adding theme_bw() to it had no effect and
    # printed "NULL"; autoplot() returns a ggplot object that the theme applies to.
    autoplot(ln_bmi_step_back, which = 1:2) + theme_bw()
    # Side-by-side coefficient table: bmi model vs. log(bmi) model.
    sjPlot::tab_model(
      bmi_step_back,
      ln_bmi_step_back,
      digits = 5,
      show.ci = FALSE
    )
      bmi log(bmi)
    Predictors Estimates p Estimates p
    (Intercept) -10.94257 <0.001 1.82766 <0.001
    chest 0.16090 <0.001 0.00575 <0.001
    abdomen 0.12726 <0.001 0.00518 <0.001
    hip 0.05047 0.103
    thigh 0.14983 <0.001 0.00643 <0.001
    knee -0.23116 <0.001 -0.00646 0.001
    forearm 0.11484 0.011 0.00399 0.031
    bicep 0.00282 0.069
    Observations 250 250
    R2 / R2 adjusted 0.902 / 0.900 0.906 / 0.903
    Final Fitted Model

    $\widehat{\log(\text{BMI})} = 1.83 + 0.0058\,\text{chest} + 0.0052\,\text{abdomen} + 0.0064\,\text{thigh} - 0.0065\,\text{knee} + 0.0028\,\text{bicep} + 0.0040\,\text{forearm}$.

    Conclusion

    # Side-by-side coefficient table: body fat model vs. log(bmi) model.
    sjPlot::tab_model(
      bf_step_back,
      ln_bmi_step_back,
      digits = 5,
      show.ci = FALSE
    )
      pct bf log(bmi)
    Predictors Estimates p Estimates p
    (Intercept) 1.52703 0.818 1.82766 <0.001
    neck -0.39650 0.076
    chest -0.12810 0.156 0.00575 <0.001
    abdomen 1.01805 <0.001 0.00518 <0.001
    hip -0.28758 0.002
    bicep 0.26094 0.086 0.00282 0.069
    wrist -1.55084 0.001
    thigh 0.00643 <0.001
    knee -0.00646 0.001
    forearm 0.00399 0.031
    Observations 250 250
    R2 / R2 adjusted 0.735 / 0.729 0.906 / 0.903

    Comparing the two models, body measurements alone explain far more of the variability in BMI (R² ≈ 0.91 for log(BMI)) than in percentage body fat (R² ≈ 0.74), so BMI is easier to predict from them. In both models the abdomen measurement is a highly significant positive predictor (p < 0.001), so any increase in it should be treated with caution.

    3. Limitations

    =======

    2.4 Age

    Predicting percentage of body fat from age

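    Fit a simple linear regression to the data to assess whether age has an influence on the percentage of body fat. Taking the log of the body fat percentage improves the fit by altering the scale and making the variable more “normally” distributed; log(1 + Y) is used because at least one observation has a body fat percentage of 0, whose log is not finite. With Y the body fat percentage and X the age: \[ \log(1 + Y) = \beta_0 + \beta_1 X + \epsilon \]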

    # Body fat percentage against age with a fitted straight line, drawn on a
    # log10 y-axis. The earlier scale_y_continuous() was immediately replaced by
    # scale_y_log10() (hence the "Scale for 'y' is already present" message), so
    # only the scale that takes effect is kept. The unused `fill` label is
    # dropped because the plot maps no fill aesthetic.
    p <- data %>%
      ggplot() +
      aes(x = age, y = pct_bf) +
      geom_point() +
      geom_smooth(method = "lm", se = FALSE) +
      theme_bw() +
      scale_x_continuous(labels = scales::number) +
      scale_y_log10() +
      labs(
        x = "Age",
        y = "Body Fat Percentage",
        title = "Proportion of Body Fat Percentage based on Age",
        caption = "Source: SOCR Data BMI Regression"
      )
    p
    ## Warning: Transformation introduced infinite values in continuous y-axis
    
    ## Warning: Transformation introduced infinite values in continuous y-axis
    ## Warning: Removed 1 rows containing non-finite values (stat_smooth).

    # Simple regression of log(1 + pct_bf) on age.
    data.lm <- lm(log1p(pct_bf) ~ age, data = data)
    summary(data.lm)
    ## 
    ## Call:
    ## lm(formula = log1p(pct_bf) ~ age, data = data)
    ## 
    ## Residuals:
    ##     Min      1Q  Median      3Q     Max 
    ## -2.8296 -0.2566  0.1179  0.3431  0.9251 
    ## 
    ## Coefficients:
    ##             Estimate Std. Error t value Pr(>|t|)    
    ## (Intercept) 2.368207   0.118541  19.978  < 2e-16 ***
    ## age         0.011535   0.002542   4.537 8.88e-06 ***
    ## ---
    ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    ## 
    ## Residual standard error: 0.5076 on 248 degrees of freedom
    ## Multiple R-squared:  0.07665,    Adjusted R-squared:  0.07293 
    ## F-statistic: 20.59 on 1 and 248 DF,  p-value: 8.883e-06

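    Because the response is log(1 + percentage of body fat), the age coefficient of 0.0115 means that a one year increase in age is associated with an increase of about 1.16% in (1 + percentage of body fat) (exp(0.011535) ≈ 1.0116), i.e. a relative increase rather than an increase of 1.15 percentage points.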

    # 95% prediction interval for log1p(pct_bf) at age 50.
    # predict.lm() takes new observations through `newdata`, and the column must
    # be named after the model's predictor (`age`). The earlier
    # `data = data.frame(x = 50)` was silently ignored, so the call returned the
    # in-sample predictions for all 250 rows.
    predict(data.lm, newdata = data.frame(age = 50), interval = "prediction", level = 0.95)
    ##        fit      lwr      upr
    ## 1 2.944981 1.942943 3.947019
    # Mark the model's prediction at age 50 on the plot.
    # The previously hard-coded 2.633523 was the fitted value of the first row,
    # on the log1p scale. Here the prediction is computed at age 50 and
    # back-transformed with expm1() to the percentage scale of the y-axis set
    # below.
    # NOTE(review): the smoothed line is a straight-line fit on the displayed
    # scale, so the marker need not sit exactly on it.
    age_mark <- 50
    bf_at_mark <- unname(expm1(predict(data.lm, newdata = data.frame(age = age_mark))))
    p +
      # annotate() draws each guide once instead of once per data row.
      annotate("segment", x = age_mark, xend = age_mark, y = 0, yend = bf_at_mark,
               colour = "gray") +
      annotate("segment", x = 25, xend = age_mark, y = bf_at_mark, yend = bf_at_mark,
               colour = "gray") +
      scale_x_continuous(limits = c(25, 75), expand = c(0, 0), labels = scales::number) +
      scale_y_continuous(limits = c(0, 50), expand = c(0, 0), labels = scales::number)
    ## Scale for 'x' is already present. Adding another scale for 'x', which
    ## will replace the existing scale.
    ## Scale for 'y' is already present. Adding another scale for 'y', which
    ## will replace the existing scale.
    ## Warning: Removed 11 rows containing non-finite values (stat_smooth).
    ## Warning: Removed 11 rows containing missing values (geom_point).

    # Residuals-vs-fitted and normal QQ plots for data.lm, side by side with
    # base graphics.
    par(mfrow = c(1, 2))
    plot(
      data.lm,
      which = 1:2
    )

    # The same two diagnostics drawn through autoplot() with theme_bw().
    autoplot(
      data.lm,
      which = 1:2
    ) +
      theme_bw()

    # Alternative model: untransformed pct_bf regressed on log(age).
    tlm <- lm(pct_bf ~ log(age), data = data)
    summary(tlm)
    ## 
    ## Call:
    ## lm(formula = pct_bf ~ log(age), data = data)
    ## 
    ## Residuals:
    ##      Min       1Q   Median       3Q      Max 
    ## -18.4197  -6.2216   0.2241   5.2902  27.0649 
    ## 
    ## Coefficients:
    ##             Estimate Std. Error t value Pr(>|t|)    
    ## (Intercept)  -12.182      6.469  -1.883   0.0609 .  
    ## log(age)       8.296      1.714   4.840 2.29e-06 ***
    ## ---
    ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    ## 
    ## Residual standard error: 7.944 on 248 degrees of freedom
    ## Multiple R-squared:  0.08629,    Adjusted R-squared:  0.08261 
    ## F-statistic: 23.42 on 1 and 248 DF,  p-value: 2.288e-06
    # Diagnostics for the log(age) model.
    autoplot(
      tlm,
      which = 1:2
    ) +
      theme_bw()

    # install.packages("tidyr")
    # install.packages("sjPlot")
    # Third variant with both sides transformed, then a side-by-side table of
    # the three age models.
    ttlm <- lm(log1p(pct_bf) ~ log(age), data = data)
    sjPlot::tab_model(
      data.lm,
      tlm,
      ttlm,
      digits = 5,
      show.ci = FALSE
    )
      log 1 p(pct bf) pct bf log 1 p(pct bf)
    Predictors Estimates p Estimates p Estimates p
    (Intercept) 2.36821 <0.001 -12.18239 0.061 1.02224 0.014
    age 0.01154 <0.001
    log(age) 8.29576 <0.001 0.49532 <0.001
    Observations 250 250 250
    R2 / R2 adjusted 0.077 / 0.073 0.086 / 0.083 0.076 / 0.072
    2.4 LINEAR REGRESSION WITH AGE

    provide context to the qn!!! why use linear regression on full model…

    2.4.1 Defining the model with population parameters

    \[ Age = \beta_0 + \beta_1density + \beta_2pctBodyFat + \beta_3weight + \beta_4height + \beta_5neck + \beta_6chest + \\ \beta_7abdomen + \beta_8waist + \beta_9hip + \beta_{10}thigh + \beta_{11}knee + \beta_{12}ankle + \beta_{13}bicep + \\ \beta_{14}forearm + \beta_{15}wrist + \beta_{16}bmi + \epsilon \]

    2.4.2 Linear regression assumptions for the full model - what is x and y?

    The residuals \(\epsilon_i\) are iid \(N(0,\sigma^2)\) and there is a linear relationship between y and x.

    # Full model: regress age on every other column of data.
    age_lm <- lm(age ~ ., data = data)
    summary(age_lm)
    ## 
    ## Call:
    ## lm(formula = age ~ ., data = data)
    ## 
    ## Residuals:
    ##      Min       1Q   Median       3Q      Max 
    ## -22.1205  -5.1938   0.1434   5.8938  22.0763 
    ## 
    ## Coefficients: (1 not defined because of singularities)
    ##              Estimate Std. Error t value Pr(>|t|)    
    ## (Intercept) -43.24027  245.61885  -0.176 0.860410    
    ## density     121.26738  187.99650   0.645 0.519526    
    ## pct_bf        0.57880    0.43458   1.332 0.184205    
    ## weight       -0.03828    0.31775  -0.120 0.904209    
    ## height       -2.13847    1.60109  -1.336 0.182966    
    ## neck          0.63745    0.47088   1.354 0.177120    
    ## chest         0.28603    0.21978   1.301 0.194395    
    ## abdomen       0.99400    0.20995   4.735 3.80e-06 ***
    ## waist              NA         NA      NA       NA    
    ## hip          -0.27058    0.30326  -0.892 0.373190    
    ## thigh        -1.54957    0.27784  -5.577 6.72e-08 ***
    ## knee          1.79738    0.48283   3.723 0.000247 ***
    ## ankle        -0.66679    0.44212  -1.508 0.132863    
    ## bicep         0.32423    0.34275   0.946 0.345148    
    ## forearm      -0.77606    0.41415  -1.874 0.062193 .  
    ## wrist         6.18153    1.02127   6.053 5.61e-09 ***
    ## bmi          -2.94254    2.25054  -1.307 0.192331    
    ## ---
    ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    ## 
    ## Residual standard error: 8.514 on 234 degrees of freedom
    ## Multiple R-squared:  0.5744, Adjusted R-squared:  0.5472 
    ## F-statistic: 21.06 on 15 and 234 DF,  p-value: < 2.2e-16
    # Residuals-vs-fitted and normal QQ plots for the full age model.
    autoplot(
      age_lm,
      which = 1:2
    ) +
      theme_bw()
    1. Linearity: In the scatterplot above, there is no obvious pattern between the residual and fitted values. Hence, we have not misspecified the model.
    2. Homoskedasticity: In the scatterplot above, the residuals do not appear to be fanning out or changing their variability over the range of the fitted values so the constant error variance assumption is met.
    3. Normality: In the QQ plot above, the points are reasonably close to the diagonal line. Approximately 6 of the points at the bottom do not lie on the line, but the departure is not severe enough to cause any concern. Therefore, we are confident that the normal assumption is at least approximately satisfied.
    2.4.3 Dropping variables using the AIC starting from the full model
    # Backward elimination from the full age model; the step log is printed.
    age_step <- step(age_lm, direction = "backward")
    ## Start:  AIC=1086.32
    ## age ~ density + pct_bf + weight + height + neck + chest + abdomen + 
    ##     waist + hip + thigh + knee + ankle + bicep + forearm + wrist + 
    ##     bmi
    ## 
    ## 
    ## Step:  AIC=1086.32
    ## age ~ density + pct_bf + weight + height + neck + chest + abdomen + 
    ##     hip + thigh + knee + ankle + bicep + forearm + wrist + bmi
    ## 
    ##           Df Sum of Sq   RSS    AIC
    ## - weight   1      1.05 16963 1084.3
    ## - density  1     30.16 16992 1084.8
    ## - hip      1     57.71 17020 1085.2
    ## - bicep    1     64.87 17027 1085.3
    ## - chest    1    122.77 17085 1086.1
    ## - bmi      1    123.92 17086 1086.1
    ## - pct_bf   1    128.58 17091 1086.2
    ## - height   1    129.31 17092 1086.2
    ## - neck     1    132.85 17095 1086.3
    ## <none>                 16962 1086.3
    ## - ankle    1    164.88 17127 1086.7
    ## - forearm  1    254.54 17217 1088.0
    ## - knee     1   1004.53 17967 1098.7
    ## - abdomen  1   1624.90 18587 1107.2
    ## - thigh    1   2254.76 19217 1115.5
    ## - wrist    1   2655.73 19618 1120.7
    ## 
    ## Step:  AIC=1084.34
    ## age ~ density + pct_bf + height + neck + chest + abdomen + hip + 
    ##     thigh + knee + ankle + bicep + forearm + wrist + bmi
    ## 
    ##           Df Sum of Sq   RSS    AIC
    ## - density  1     29.21 16993 1082.8
    ## - hip      1     56.90 17020 1083.2
    ## - bicep    1     63.82 17027 1083.3
    ## - chest    1    121.76 17085 1084.1
    ## - pct_bf   1    127.54 17091 1084.2
    ## - neck     1    131.80 17095 1084.3
    ## <none>                 16963 1084.3
    ## - ankle    1    165.02 17128 1084.8
    ## - forearm  1    253.82 17217 1086.0
    ## - bmi      1    846.07 17810 1094.5
    ## - knee     1   1030.02 17994 1097.1
    ## - abdomen  1   1629.51 18593 1105.3
    ## - height   1   1677.56 18641 1105.9
    ## - thigh    1   2322.94 19286 1114.4
    ## - wrist    1   2658.31 19622 1118.7
    ## 
    ## Step:  AIC=1082.77
    ## age ~ pct_bf + height + neck + chest + abdomen + hip + thigh + 
    ##     knee + ankle + bicep + forearm + wrist + bmi
    ## 
    ##           Df Sum of Sq   RSS    AIC
    ## - hip      1     50.94 17044 1081.5
    ## - bicep    1     57.58 17050 1081.6
    ## - chest    1    134.22 17127 1082.7
    ## - neck     1    136.37 17129 1082.8
    ## <none>                 16993 1082.8
    ## - ankle    1    180.17 17173 1083.4
    ## - forearm  1    255.59 17248 1084.5
    ## - pct_bf   1    425.59 17418 1087.0
    ## - bmi      1    848.35 17841 1093.0
    ## - knee     1   1032.06 18025 1095.5
    ## - abdomen  1   1601.63 18594 1103.3
    ## - height   1   1667.12 18660 1104.2
    ## - thigh    1   2366.73 19359 1113.4
    ## - wrist    1   2730.53 19723 1118.0
    ## 
    ## Step:  AIC=1081.52
    ## age ~ pct_bf + height + neck + chest + abdomen + thigh + knee + 
    ##     ankle + bicep + forearm + wrist + bmi
    ## 
    ##           Df Sum of Sq   RSS    AIC
    ## - bicep    1     67.40 17111 1080.5
    ## <none>                 17044 1081.5
    ## - ankle    1    167.14 17211 1082.0
    ## - chest    1    175.85 17219 1082.1
    ## - neck     1    182.79 17226 1082.2
    ## - forearm  1    243.33 17287 1083.1
    ## - pct_bf   1    462.70 17506 1086.2
    ## - knee     1   1001.01 18045 1093.8
    ## - bmi      1   1417.80 18461 1099.5
    ## - abdomen  1   1550.95 18594 1101.3
    ## - wrist    1   2753.58 19797 1117.0
    ## - height   1   2764.06 19808 1117.1
    ## - thigh    1   3012.53 20056 1120.2
    ## 
    ## Step:  AIC=1080.5
    ## age ~ pct_bf + height + neck + chest + abdomen + thigh + knee + 
    ##     ankle + forearm + wrist + bmi
    ## 
    ##           Df Sum of Sq   RSS    AIC
    ## <none>                 17111 1080.5
    ## - ankle    1    178.13 17289 1081.1
    ## - forearm  1    192.84 17304 1081.3
    ## - chest    1    194.85 17306 1081.3
    ## - neck     1    209.14 17320 1081.5
    ## - pct_bf   1    493.44 17604 1085.6
    ## - knee     1    988.36 18099 1092.5
    ## - bmi      1   1359.87 18471 1097.6
    ## - abdomen  1   1487.35 18598 1099.3
    ## - height   1   2700.19 19811 1115.1
    ## - wrist    1   2841.19 19952 1116.9
    ## - thigh    1   2964.74 20076 1118.5

    Backwards selection using the AIC dropped the variables waist (not estimable because of a singularity), weight, density, hip and bicep, but kept chest in the model.

    2.4.4 Fitted model for the model selected by the step-wise procedure.

    \[ Age = 98.0750 + 0.3330 \times pctBodyFat - 2.4808 \times height + 0.7733 \times neck\\ + 0.3496 \times chest + 0.9131 \times abdomen - 1.5923 \times thigh + 1.7430 \times knee\\ - 0.6872 \times ankle - 0.6485 \times forearm + 6.3441 \times wrist - 3.4753 \times bmi \] Looking at the \(R^2\) value (multiple R-squared) from the summary output, about 57% of the variability of age is explained by the regression on percentage of body fat, height, neck, chest, abdomen, thigh, knee, ankle, forearm, wrist and BMI.

    1. On average, holding the other variables constant, a 1 percentage point increase in body fat is associated with a 0.33 year increase in predicted age.
    2. The reverse reading does not follow from this model: because age is the response, the coefficient cannot be interpreted as the change in body fat percentage per additional year of age.
    # Coefficient table and fit statistics for the model selected by step().
    summary(age_step)
    ## 
    ## Call:
    ## lm(formula = age ~ pct_bf + height + neck + chest + abdomen + 
    ##     thigh + knee + ankle + forearm + wrist + bmi, data = data)
    ## 
    ## Residuals:
    ##      Min       1Q   Median       3Q      Max 
    ## -22.0458  -5.2040   0.2848   5.6562  21.6013 
    ## 
    ## Coefficients:
    ##             Estimate Std. Error t value Pr(>|t|)    
    ## (Intercept)  98.0750    16.1779   6.062 5.22e-09 ***
    ## pct_bf        0.3330     0.1271   2.620  0.00936 ** 
    ## height       -2.4808     0.4048  -6.128 3.65e-09 ***
    ## neck          0.7733     0.4534   1.706  0.08939 .  
    ## chest         0.3496     0.2123   1.646  0.10103    
    ## abdomen       0.9131     0.2008   4.548 8.62e-06 ***
    ## thigh        -1.5923     0.2480  -6.422 7.25e-10 ***
    ## knee          1.7430     0.4701   3.708  0.00026 ***
    ## ankle        -0.6872     0.4366  -1.574  0.11680    
    ## forearm      -0.6485     0.3960  -1.638  0.10279    
    ## wrist         6.3441     1.0092   6.286 1.54e-09 ***
    ## bmi          -3.4753     0.7991  -4.349 2.03e-05 ***
    ## ---
    ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    ## 
    ## Residual standard error: 8.479 on 238 degrees of freedom
    ## Multiple R-squared:  0.5707, Adjusted R-squared:  0.5509 
    ## F-statistic: 28.77 on 11 and 238 DF,  p-value: < 2.2e-16
    2.4.5 Linear regression assumptions for the stepwise model - why do this? same as previous??
    # Residuals-vs-fitted and normal QQ plots for the stepwise-selected age model.
    autoplot(
      age_step,
      which = 1:2
    ) +
      theme_bw()

    3. Limitations

    >>>>>>> 62e0a8412e27ea0e3d0247c40f04dfa9ef0c7a0d

    4. Conclusion

    5. References